import time
import urllib.request
from bs4 import BeautifulSoup
import json
import re
import math
import sys

class Processor:
    """Extract posts from a fetched HTML page and publish recent ones to a queue.

    The page is searched for elements whose CSS class contains
    ``userContentWrapper``.  For each such element a record is built from a
    caller-supplied template, stamped with the post's issue time, and pushed
    to the ``"twitter_result"`` queue as a JSON string.
    """

    # Posts whose issue time lies more than this many seconds in the past are
    # skipped instead of being published.
    MAX_AGE_SECONDS = 86400

    # Matches issue-time titles such as "2017年2月1日 01:19".  The hour has to be
    # preceded by a space; anything between the day and that space is ignored.
    _ISSUE_TIME_RE = re.compile(
        r'(\d{4})年(\d{1,2})月(\d{1,2})日.*? (\d{1,2}):(\d{1,2})'
    )

    @classmethod
    def _parse_issue_time(cls, time_str):
        """Convert an issue-time title into a Unix timestamp.

        Args:
            time_str: Text of the form ``"2017年2月1日 01:19"``.

        Returns:
            The timestamp as an ``int`` (seconds are taken to be ``00`` and
            the value is interpreted in the local time zone, as
            ``time.mktime`` does), or ``None`` when ``time_str`` is empty,
            does not match the expected layout, or names an impossible
            date/time.
        """
        if not time_str:
            return None

        found = cls._ISSUE_TIME_RE.match(time_str)
        if found is None:
            return None

        year, month, day, hour, minute = found.groups()
        stamp_text = '{}-{}-{} {}:{}:00'.format(year, month, day, hour, minute)
        try:
            # strptime validates the field ranges (e.g. rejects month 13).
            parsed = time.strptime(stamp_text, "%Y-%m-%d %H:%M:%S")
            return int(time.mktime(parsed))
        except (ValueError, OverflowError):
            return None

    def process(self, html, data, recv_data_sample, queue):
        """Parse ``html`` and publish every sufficiently recent post.

        Args:
            html: Markup of the fetched page.
            data: Mapping describing the fetch job; ``data['url']`` is read.
            recv_data_sample: Record template.  It must contain a ``'data'``
                mapping.  The template itself is not modified; every post is
                written into its own copy so that values from one post
                (notably a failure status) cannot leak into the next one.
            queue: Object exposing ``put(name, payload)``.

        Returns:
            The number of records that were put on the queue.
        """
        soup = BeautifulSoup(html, "lxml")

        # A compiled pattern makes find_all match any class that merely
        # contains the given text.
        content_list = soup.find_all(class_=re.compile("userContentWrapper"))

        if not content_list:
            print('this page contains no content')
            return 0

        published = 0
        for item in content_list:
            record = dict(recv_data_sample)
            payload = dict(recv_data_sample['data'])
            record['data'] = payload

            payload['content'] = ''.join(
                paragraph.get_text() for paragraph in item.select('p')
            )

            now = int(time.time())
            record['url'] = data['url']
            payload['url'] = data['url']
            payload['create_time'] = now
            payload['update_time'] = now

            author_links = item.select('span a')
            payload['twitter_name'] = (
                author_links[0].get_text() if author_links else ''
            )

            # The issue time is carried in the title attribute of the first
            # <abbr>, formatted like "2017年2月1日 01:19".
            time_tags = item.select('abbr')
            raw_issue_time = time_tags[0].get('title', '') if time_tags else ''
            payload['issue_time'] = raw_issue_time

            time_stamp = self._parse_issue_time(raw_issue_time)
            if time_stamp is None:
                # No usable timestamp: flag the record as failed.  It is still
                # published (with the raw issue time) so the failure stays
                # visible downstream.
                # NOTE(review): confirm consumers expect failed records.
                record['state'], payload['status'] = -1, -1
            else:
                payload['issue_time'] = time_stamp
                payload['url'] = data['url'] + "?time=" + str(time_stamp)

                # Only posts from roughly the last day are of interest.
                if now - time_stamp > self.MAX_AGE_SECONDS:
                    print("time is not match,url={}".format(payload['url']))
                    continue

            print(record)  # show what was scraped

            # Hand the result back to the queue.
            queue.put("twitter_result", json.dumps(record))
            published += 1

        return published